# !pip install dfply
from dfply import *
import networkx as nx
import matplotlib.pyplot as plt
from bokeh.io import show, output_notebook
from bokeh.models import Plot, Range1d, MultiLine, Circle
from bokeh.models.graphs import from_networkx
from bokeh.transform import factor_cmap
from bokeh.transform import linear_cmap
# Imported explicitly: `np` is used below and should not depend on the dfply star import.
import numpy as np
import pandas as pd
from bokeh.palettes import Spectral11, Colorblind
from math import ceil
from math import pow
from bokeh.models import (BoxZoomTool, Circle, HoverTool,PanTool,
MultiLine, Plot, Range1d, ResetTool, NodesAndLinkedEdges,TapTool)
import warnings
warnings.filterwarnings("ignore")
# Load the transfers table from a pickle stored outside this file's directory.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
transfers_df = pd.read_pickle("./../../Capstone/Data/Clean/Transfers_Network.pkl")
# Order the rows by the "date" column, oldest first.
transfers_df = transfers_df.sort_values("date",ascending = True)
# Remove exact duplicate rows, then any row with a missing value in any column.
transfers_df = transfers_df.drop_duplicates()
transfers_df = transfers_df.dropna()
def box_cox_normalization(node_size):
    """Compress a node degree into a small integer marker size.

    Applies a Box-Cox style power transform with exponent 0.5
    (``(x ** 0.5 - 1) / 0.5``), halves the result and rounds it up.
    """
    exponent = 0.5
    scale = 0.5
    transformed = (pow(node_size, exponent) - 1) / exponent
    scaled = scale * transformed
    return ceil(scaled)
def z_score(input_v, avg , sd):
    """Return how many standard deviations `input_v` lies from `avg`."""
    deviation = input_v - avg
    return deviation / sd
def edge_color(t_type):
    """Map a transfer-window label to the colour used for its edges.

    "summer" is drawn orange and "winter" black; any other label is red.
    """
    if t_type == "summer":
        return "orange"
    return "black" if t_type == "winter" else "red"
def _total_fee_by(transfers_df, keys, total_name):
    """Return one row per `keys` combination with the summed `fee` in `total_name`."""
    totals = transfers_df.groupby(keys, as_index=False)["fee"].sum()
    return totals.rename(columns={"fee": total_name})


def transfers_detailed(transfers_df):
    """Attach per-season fee totals for clubs and leagues to every transfer row.

    Four columns are added by merging grouped sums of ``fee`` back onto the
    rows:

    * ``spent``           - total fee per (``to``, ``season``)
    * ``received``        - total fee per (``from``, ``season``)
    * ``spent_league``    - total fee per (``to_league``, ``season``)
    * ``received_league`` - total fee per (``from_league``, ``season``)

    Parameters
    ----------
    transfers_df : pandas.DataFrame
        Must contain ``from``, ``to``, ``from_league``, ``to_league``,
        ``season`` and ``fee`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (default inner merges, so the index is reset) with the
        four total columns appended. An empty input yields an empty frame
        that still carries the four columns.
    """
    aggregates = (
        (["to", "season"], "spent"),
        (["from", "season"], "received"),
        (["to_league", "season"], "spent_league"),
        (["from_league", "season"], "received_league"),
    )
    # Every total is computed from the untouched input before any merge, and
    # with the built-in grouped sum rather than a per-group Python lambda, so
    # the result column is named explicitly instead of relying on label `0`.
    totals = [
        (keys, _total_fee_by(transfers_df, keys, total_name))
        for keys, total_name in aggregates
    ]
    detailed = transfers_df
    for keys, table in totals:
        detailed = pd.merge(detailed, table, on=keys)
    return detailed
def nodes_attributes(transfers_df):
    """Build one attribute row per club that appears both as seller and buyer.

    The result has the columns ``club``, ``country``, ``cont``, ``league``,
    ``league_class``, ``received``, ``spent`` and ``profit``
    (``received - spent``). All merges are inner joins, so a club is kept
    only if it occurs in ``from`` and in ``to`` with identical descriptive
    attributes.

    `transfers_df` must already carry ``received`` and ``spent`` columns.
    """
    def _club_profile(side):
        # Descriptive columns of one side ("from"/"to"), renamed to a neutral
        # schema; only the first row seen for each club is kept.
        prefix = side + "_"
        wanted = [side] + [prefix + suffix
                           for suffix in ("country", "cont", "league", "league_class")]
        profile = transfers_df[wanted].drop_duplicates(side)
        profile = profile.rename(lambda col: col.replace(prefix, ""), axis = "columns")
        return profile.rename({side: "club"}, axis = "columns")

    sellers = _club_profile("from")
    buyers = _club_profile("to")
    shared_keys = ["club","country","cont","league","league_class"]
    nodes_attr = pd.merge(sellers, buyers, on = shared_keys).drop_duplicates()

    # NOTE(review): when `received`/`spent` are per-season totals repeated on
    # every row, summing them per club counts each season total once per
    # transfer row - confirm this is the intended measure.
    received_totals = transfers_df >> group_by(X["from"]) >> summarize(received = X.received.sum())
    received_totals = received_totals.rename({"from":"club"}, axis = "columns")
    spent_totals = transfers_df >> group_by(X["to"]) >> summarize(spent = X.spent.sum())
    spent_totals = spent_totals.rename({"to":"club"}, axis = "columns")

    money = pd.merge(received_totals, spent_totals, on = ["club"]).drop_duplicates()
    nodes_attr = pd.merge(nodes_attr, money, on = ["club"])
    nodes_attr['profit'] = nodes_attr['received'] - nodes_attr['spent']
    return nodes_attr
def set_nodes_attributes(G, nodes_attr):
    """Annotate a club-level transfer graph with node and edge display data.

    Node attributes written: ``continent``, ``country``, ``league``,
    ``league_class``, ``received``, ``spent``, ``profit`` (all looked up in
    `nodes_attr` by ``club``), ``name`` (the node key), ``connections`` and
    ``degree`` (total degree), ``node_size`` (compressed degree),
    ``outgoing_edges`` and ``incoming_edges``.

    Edge attributes written: ``edge_width`` (z-score of ``fee``),
    ``edge_alpha`` (z-score of ``age``) and ``edge_color`` (from ``type``).

    Parameters
    ----------
    G : networkx directed (multi)graph
        Edges must carry ``fee``, ``age`` and ``type`` attributes.
    nodes_attr : pandas.DataFrame
        One row per club with ``club``, ``cont``, ``country``, ``league``,
        ``league_class``, ``received``, ``spent`` and ``profit`` columns.

    Returns
    -------
    The same graph `G`, modified in place.
    """
    def _standardised(values):
        # z-score of every edge value against the mean/sd of that attribute
        sample = np.array(list(values.values()))
        mean, spread = np.mean(sample), np.std(sample)
        return {edge: z_score(value, mean, spread) for edge, value in values.items()}

    clubs = nodes_attr.club
    node_columns = (
        ('continent', 'cont'),
        ('country', 'country'),
        ('league', 'league'),
        ('received', 'received'),
        ('spent', 'spent'),
        ('profit', 'profit'),
        ('league_class', 'league_class'),
    )
    for attr_name, column in node_columns:
        nx.set_node_attributes(G, dict(zip(clubs, nodes_attr[column])), attr_name)

    degrees = dict(G.degree)
    nx.set_node_attributes(G, {node: node for node in degrees}, 'name')
    nx.set_node_attributes(G, degrees, 'connections')
    nx.set_node_attributes(G, degrees, 'degree')
    node_sizes = {node: box_cox_normalization(degree) for node, degree in degrees.items()}
    nx.set_node_attributes(G, node_sizes, 'node_size')
    nx.set_node_attributes(G, dict(G.out_degree()), "outgoing_edges")
    nx.set_node_attributes(G, dict(G.in_degree()), "incoming_edges")

    fees = nx.get_edge_attributes(G, "fee")
    ages = nx.get_edge_attributes(G, "age")
    types = nx.get_edge_attributes(G, "type")
    nx.set_edge_attributes(G, _standardised(fees), 'edge_width')
    # Previously this reused the fee values and fee statistics, so the alpha
    # channel duplicated the edge width; it is now derived from player age.
    nx.set_edge_attributes(G, _standardised(ages), 'edge_alpha')
    edge_colors = {edge: edge_color(t_type) for edge, t_type in types.items()}
    nx.set_edge_attributes(G, edge_colors, 'edge_color')
    return G
def group_league(transfers_df):
    """Collapse transfers into one row per (from_league, to_league, type).

    Each row holds the summed ``fee``, the number of distinct index values
    (``count``), the mean ``age``, the first seen country / league class /
    continent of both leagues, and the sums of the ``received_league`` and
    ``spent_league`` columns.
    """
    aggregations = dict(
        fee = X.fee.sum(),
        count = X.index.nunique(),
        age = X.age.mean(),
        from_country = X.from_country.unique()[0],
        from_league_class = X.from_league_class.unique()[0],
        from_cont = X.from_cont.unique()[0],
        from_league_received = X.received_league.sum(),
        to_country = X.to_country.unique()[0],
        to_league_class = X.to_league_class.unique()[0],
        to_cont = X.to_cont.unique()[0],
        to_league_spent = X.spent_league.sum(),
    )
    return (transfers_df >>
            group_by(X.from_league, X.to_league, X["type"]) >>
            summarize(**aggregations))
def league_node_attrs(by_leagues):
    """Derive one attribute row per league from the league-pair summary.

    A league is kept only when it appears on both the ``from`` and the ``to``
    side with the same country, continent and league class (inner merge).
    The result has ``league``, ``country``, ``cont``, ``league_class``,
    ``received``, ``spent`` and ``profit`` (``received - spent``).
    """
    def _side_profile(side, total_name, total_column):
        # First seen value of each descriptive column and of the money total
        # for every league on the given side of the transfer.
        league_column = side + "_league"
        summary = {
            "country": X[side + "_country"].unique()[0],
            "cont": X[side + "_cont"].unique()[0],
            "league_class": X[side + "_league_class"].unique()[0],
            total_name: X[total_column].unique()[0],
        }
        profile = by_leagues >> group_by(X[league_column]) >> summarize(**summary)
        return profile.rename({league_column: "league"}, axis = "columns")

    selling = _side_profile("from", "received", "from_league_received")
    buying = _side_profile("to", "spent", "to_league_spent")
    node_attrs = pd.merge(selling, buying, on = ["league","league_class","country","cont"])
    node_attrs["profit"] = node_attrs["received"] - node_attrs["spent"]
    return node_attrs
def league_attributes(G, node_attrs):
    """Annotate a league-level transfer graph with node and edge display data.

    Node attributes written: ``name`` (the node key), ``country``,
    ``continent``, ``league_class``, ``received``, ``spent``, ``profit``
    (looked up in `node_attrs` by ``league``), ``degree`` and
    ``connections`` (total degree), ``node_size`` (compressed degree),
    ``outgoing_edges`` and ``incoming_edges``.

    Edge attributes written: ``edge_color`` (from ``type``), ``edge_width``
    (z-score of ``fee``) and ``edge_alpha`` (z-score of ``age``).

    Parameters
    ----------
    G : networkx directed (multi)graph
        Edges must carry ``fee``, ``age`` and ``type`` attributes.
    node_attrs : pandas.DataFrame
        One row per league with ``league``, ``country``, ``cont``,
        ``league_class``, ``received``, ``spent`` and ``profit`` columns.

    Returns
    -------
    The same graph `G`, modified in place.
    """
    def _standardised(values):
        # z-score of every edge value against the mean/sd of that attribute
        sample = np.array(list(values.values()))
        mean, spread = np.mean(sample), np.std(sample)
        return {edge: z_score(value, mean, spread) for edge, value in values.items()}

    degrees = dict(G.degree)
    nx.set_node_attributes(G, {node: node for node in degrees}, "name")

    leagues = node_attrs.league
    node_columns = (
        ("country", "country"),
        ("continent", "cont"),
        ("league_class", "league_class"),
        ("received", "received"),
        ("spent", "spent"),
        ("profit", "profit"),
    )
    for attr_name, column in node_columns:
        nx.set_node_attributes(G, dict(zip(leagues, node_attrs[column])), attr_name)

    nx.set_node_attributes(G, degrees, 'degree')
    nx.set_node_attributes(G, dict(G.out_degree()), "outgoing_edges")
    nx.set_node_attributes(G, dict(G.in_degree()), "incoming_edges")
    node_sizes = {node: box_cox_normalization(degree) for node, degree in degrees.items()}
    nx.set_node_attributes(G, node_sizes, 'node_size')
    nx.set_node_attributes(G, degrees, 'connections')

    types = nx.get_edge_attributes(G, "type")
    edge_colors = {edge: edge_color(t_type) for edge, t_type in types.items()}
    nx.set_edge_attributes(G, edge_colors, 'edge_color')

    fees = nx.get_edge_attributes(G, "fee")
    ages = nx.get_edge_attributes(G, "age")
    nx.set_edge_attributes(G, _standardised(fees), 'edge_width')
    # Previously this reused the fee values and fee statistics, so the alpha
    # channel duplicated the edge width; it is now derived from the age value.
    nx.set_edge_attributes(G, _standardised(ages), 'edge_alpha')
    return G
def get_subgraph(G, attr, value):
    """Return the subgraph of `G` whose nodes have `attr` equal to `value`.

    Parameters
    ----------
    G : graph
        Any object offering ``nodes(data=True)`` and ``subgraph(nodes)``,
        e.g. a networkx graph.
    attr : str
        Name of the node attribute to test.
    value : object or iterable of objects
        A single accepted value (strings count as a single value and are
        compared for equality, not as substrings) or a collection of
        accepted values.

    Returns
    -------
    Whatever ``G.subgraph`` returns for the matching nodes. Nodes that do
    not carry `attr` are skipped.
    """
    from collections.abc import Iterable

    if isinstance(value, (str, bytes)) or not isinstance(value, Iterable):
        accepted = (value,)
    else:
        accepted = tuple(value)
    # Filter the graph that was passed in; the earlier version always read
    # the module-level `transfers_G` regardless of the argument.
    selected = [
        node for node, data in G.nodes(data=True)
        if attr in data and data[attr] in accepted
    ]
    return G.subgraph(selected)
def plot_net(G,title,**kwargs):
    """Draw `G` as an interactive Bokeh network plot and show it in the notebook.

    Required keyword arguments:
      node_color - node attribute used as a categorical colour factor
      node_size  - value passed as the node glyph size
      edge_alpha, edge_width, edge_color - values passed as the edge glyph's
                   line alpha, width and colour
      hover_data - iterable of node attribute names shown in the tooltip
      layout     - layout function handed to ``from_networkx``
    """
    node_color = kwargs["node_color"]
    # The distinct values of the colouring attribute are the colour factors.
    factors = list(set(nx.get_node_attributes(G, node_color).values()))
    node_size = kwargs["node_size"]
    edge_alpha = kwargs["edge_alpha"]
    edge_width = kwargs["edge_width"]
    line_color = kwargs["edge_color"]
    # One (label, "@field") tooltip entry per requested attribute.
    tooltips = [(field, "@" + field) for field in kwargs["hover_data"]]

    figure = Plot(plot_width=700, plot_height=500,
                  x_range=Range1d(-1.1, 1.1), y_range=Range1d(-1.1, 1.1))
    figure.title.text = title
    hover = HoverTool(tooltips=tooltips)
    figure.add_tools(hover, BoxZoomTool(), ResetTool(), PanTool(), TapTool())

    layout = kwargs["layout"]
    renderer = from_networkx(G, layout, scale=1, center=(0, 0))
    fill = factor_cmap(node_color, Colorblind[7], factors)
    renderer.node_renderer.glyph = Circle(size=node_size, fill_color=fill)
    renderer.edge_renderer.glyph = MultiLine(line_alpha=edge_alpha,
                                             line_width=edge_width,
                                             line_color=line_color)
    # Selecting a node also highlights the edges linked to it, in blue.
    renderer.edge_renderer.selection_glyph = MultiLine(line_color='blue', line_width=edge_width)
    renderer.selection_policy = NodesAndLinkedEdges()
    figure.renderers.append(renderer)

    output_notebook()
    show(figure)
def call_plot(G,title,layout,node_size = "node_size", node_color = "continent", edge_color = "edge_color",
              edge_alpha = "edge_alpha", edge_width = "edge_width"):
    """Plot `G` through `plot_net` with the standard tooltip fields.

    The styling arguments default to the attribute names "node_size",
    "continent", "edge_color", "edge_alpha" and "edge_width".
    """
    tooltip_fields = [
        "connections", "country", "continent", "league_class", "name",
        "spent", "received", "profit", "incoming_edges", "outgoing_edges",
    ]
    options = dict(
        node_size = node_size,
        node_color = node_color,
        edge_color = edge_color,
        edge_alpha = edge_alpha,
        edge_width = edge_width,
        hover_data = tooltip_fields,
        layout = layout,
    )
    plot_net(G, title, **options)
def network_stats(G):
    """Print density, reciprocity and assortativity coefficients of `G`.

    Attribute assortativity is reported for the node attributes
    ``continent``, ``league_class`` and ``country``, followed by degree
    assortativity.
    """
    print("Network's density")
    print(nx.density(G))
    print("Network's reciprocity")
    print(nx.reciprocity(G))
    for attribute in ("continent", "league_class", "country"):
        print(f"Network's assortavity based on {attribute}")
        print(nx.attribute_assortativity_coefficient(G, attribute))
    print("Network's assortavity based on degrees")
    print(nx.degree_assortativity_coefficient(G))
def network_info(G):
    """Print per-node rankings for a directed graph, highest value first.

    Six tables are printed in order: out-degree, in-degree, degree
    centrality, in-degree centrality, out-degree centrality and the
    ``profit`` node attribute.
    """
    def _report(heading, values, column):
        # Turn a {node: value} mapping into a two-column table, then print
        # the heading followed by the table sorted in descending order.
        table = pd.DataFrame.from_dict(values, orient = "index").reset_index()
        table.columns = ['club', column]
        print(heading)
        print(table.sort_values(column, ascending = False))

    _report("outgoing edges", dict(G.out_degree()), 'outgoing_count')
    _report("incoming edges", dict(G.in_degree()), 'incoming_count')
    _report("degree centrality", nx.degree_centrality(G), 'centrality')
    _report("incoming degree centrality", nx.in_degree_centrality(G), 'in_centrality')
    _report("outgoing degree centrality", nx.out_degree_centrality(G), 'out_centrality')
    _report("Profit made by teams", nx.get_node_attributes(G, "profit"), 'profit')
# Columns of the transfers table that are copied onto every graph edge.
edges_cols = ['loan','type','fee','mv','name','continent','nationality','main_field_position',
'field_position','age','season','date']
# Add the per-season spent/received totals, then build the per-club attribute table.
transfers_df = transfers_detailed(transfers_df)
nodes_attr = nodes_attributes(transfers_df)
# Keep only transfers whose two clubs both have an attribute row.
clubs = nodes_attr.club.unique()
transfers_df = transfers_df[transfers_df["from"].isin(clubs) & transfers_df["to"].isin(clubs)]
# One directed edge per transfer row; parallel edges are allowed (MultiDiGraph).
transfers_G = nx.from_pandas_edgelist(transfers_df, 'from','to', edges_cols, create_using=nx.MultiDiGraph)
transfers_G = set_nodes_attributes(transfers_G, nodes_attr)
# Drop clubs with fewer than 18 incident edges.
# NOTE(review): degree-based node attributes were set before this removal and are not refreshed.
remove = [node for node,degree in dict(transfers_G.degree()).items() if degree < 18]
transfers_G.remove_nodes_from(remove)
call_plot(transfers_G, "Soccer Transfers Network", nx.spring_layout)
As the network contains a lot of nodes, we cannot get much information from the visualization. However, we can see that the European teams dominate the market, with the majority of them connected to each other. We can also see that the majority of the transfers happen during the summer transfer window, except for some Asian teams, which make most of their deals during the winter transfer window.
from networkx.algorithms import community
# girvan_newman returns a generator of successive community partitions;
# the first six are pulled here and the sixth is used for the labelling.
communities_generator = community.girvan_newman(transfers_G)
(
    next_level_communities,
    next_1_level_communities,
    next_2_level_communities,
    next_3_level_communities,
    next_4_level_communities,
    next_5_level_communities,
) = [next(communities_generator) for _ in range(6)]
len(next_5_level_communities)
# Tag every node with "community<k>", numbering the communities from 1.
# The loop variable is called `members` so that it does not rebind the
# imported `community` module, which made this cell fail when re-run.
for i, members in enumerate(next_5_level_communities, start=1):
    for node in members:
        transfers_G.nodes[node]["community"] = f"community{i}"
call_plot(transfers_G, "Soccer Transfers Network Communities", nx.spring_layout, node_color = "community")
network_stats(transfers_G)
As we can see, the network's density is very low, which is logical, as we have many teams and not all of them have connections with each other. However, the reciprocity of the network is relatively high, as most of the teams that make deals with each other have transfers in the opposite direction too. The main attributes for the assortativity of the teams are their continent and country, as it is easier for players to move to another team on the same continent where they play, and even more so when it happens within the same country. The metric is around 0.5, as most of the talented players from other continents and non-EU countries tend to move to European soccer clubs, where they have higher chances of succeeding. The league's class has the lowest effect on the assortativity, as most of the time players from lower-ranked leagues tend to move to higher-ranked leagues. The degree of the node also has a relatively weak connection to the assortativity of the nodes, as teams with a low number of connections are not always connected to teams with a lot of connections.
# Print the degree, centrality and profit rankings for the club-level transfers graph.
network_info(transfers_G)
We can see that the most central team of the network is Chelsea, the top 5 also contains Italian teams, and the lowest centrality is found among less popular teams. The situation is almost the same for in-degree and out-degree centrality. Finally, we can see that the most profitable teams are Benfica, Porto and Ajax, as they are famous worldwide for raising and selling young talents, whereas top teams such as Man City and Barcelona are in the lowest places in terms of making profits from transfers.
# Keep only the rows flagged as loans.
loans_df = transfers_df[transfers_df.loan == True]
# The spent/received columns are not recomputed for the loans subset (call left disabled).
# loans_df = transfers_detailed(loans_df)
nodes_attr = nodes_attributes(loans_df)
# Keep only loans whose two clubs both have an attribute row.
clubs = nodes_attr.club.unique()
loans_df = loans_df[loans_df["from"].isin(clubs) & loans_df["to"].isin(clubs)]
# One directed edge per loan row; parallel edges are allowed (MultiDiGraph).
loans_G = nx.from_pandas_edgelist(loans_df, 'from','to', edges_cols, create_using=nx.MultiDiGraph)
loans_G = set_nodes_attributes(loans_G, nodes_attr)
# Drop clubs with fewer than 15 incident edges.
remove = [node for node,degree in dict(loans_G.degree()).items() if degree < 15]
loans_G.remove_nodes_from(remove)
# Bare expression: presumably a notebook display of the frame; it has no effect when run as a script.
transfers_df
call_plot(loans_G, "Soccer Loans' Network", nx.spring_layout)
The situation is almost the same in terms of the main teams of the network. However, we can see that more young players are involved in loans taking place during the winter transfer window (black color for the winter window, low opacity for young players). We can also see two teams from South Korea that are isolated.
# Print density, reciprocity and assortativity coefficients for the loans graph.
network_stats(loans_G)
The metrics are almost the same as for the transfers network, except for the almost maximal value of reciprocity, which is logical, as in most cases a player who is loaned to another club comes back to his club, and only in some cases does the borrowing club buy him.
# Aggregate transfers per (from_league, to_league, type) and derive the per-league attributes.
by_leagues = group_league(transfers_df)
node_attrs = league_node_attrs(by_leagues)
# One directed edge per aggregated row, carrying fee, count, age and type.
leagues_G = nx.from_pandas_edgelist(by_leagues, 'from_league','to_league', ['fee','count','age','type'], create_using=nx.MultiDiGraph)
leagues_G = league_attributes(leagues_G, node_attrs)
call_plot(leagues_G, "Soccer Transfers' Network by Leagues", nx.spring_layout, node_size = "node_size")
If we take the leagues as nodes, we can see that teams from the South Korean league have the weakest connection to the other leagues' teams. In general, the graph is highly interconnected and almost all of the leagues have direct links to each other.
# Print density, reciprocity and assortativity coefficients for the league-level graph.
network_stats(leagues_G)
As we can see, the density of the network is very high, as it is a multigraph, where two nodes can have more than one edge between them. As we have taken the league as a node, the reciprocity is very high compared to the graph where the nodes were the teams. In terms of assortativity, the continent of the leagues has the strongest effect, and the degree of the node has the weakest effect, which is even negative.
# Print the degree, centrality and profit rankings for the league-level graph.
network_info(leagues_G)
The main member of the network in terms of outgoing transfers is the Netherlands' league. The Championship, the Premier League and MLS have the highest numbers of incoming edges. The Eredivisie has the highest centrality measures and the Championship has the highest in-degree centrality. The Bundesliga makes the most profit from transfers, while Premier League teams spend much more than they receive in transfers.
# Keep only the rows flagged as loans (the `loan` column is used directly as a boolean mask).
leagues_loans = transfers_df[transfers_df.loan]
# Aggregate the loans per (from_league, to_league, type) and derive the per-league attributes.
by_leagues = group_league(leagues_loans)
node_attrs = league_node_attrs(by_leagues)
# One directed edge per aggregated row, carrying fee, count, age and type.
leagues_loans_G = nx.from_pandas_edgelist(by_leagues, 'from_league','to_league', ['fee','count','age','type'], create_using=nx.MultiDiGraph)
leagues_loans_G = league_attributes(leagues_loans_G, node_attrs)
call_plot(leagues_loans_G, "Soccer's Loans Network by leagues", nx.spring_layout, node_size = 'node_size')
The loans network is very similar to the transfers network but is less dense.
# Print density, reciprocity and assortativity coefficients for the league-level loans graph.
network_stats(leagues_loans_G)
The main difference in the metrics, apart from lower density and reciprocity, is that the assortativity coefficient based on country is higher, as most teams loan out their players to lower leagues in their own country so that they can gain playing time.
# Print the degree, centrality and profit rankings for the league-level loans graph.
network_info(leagues_loans_G)
The Premier League has the highest centrality measure, while the Bundesliga and the Championship have the highest out-degree and in-degree centralities respectively. The Championship made the highest profit from loans and the Premier League the lowest.
# Keep only the transfers whose window type is "winter".
winter_t = transfers_df[transfers_df["type"] == "winter"]
# Aggregate them per (from_league, to_league, type) and derive the per-league attributes.
by_leagues_s = group_league(winter_t)
node_attrs = league_node_attrs(by_leagues_s)
# One directed edge per aggregated row, carrying fee, count, age and type.
winter_leagues_G = nx.from_pandas_edgelist(by_leagues_s, 'from_league','to_league', ['fee','count','age','type'], create_using=nx.MultiDiGraph)
winter_leagues_G = league_attributes(winter_leagues_G, node_attrs)
# Nodes are sized by their number of incoming edges here instead of the default "node_size".
call_plot(winter_leagues_G, "Winter Transfers Network by leagues", nx.spring_layout, node_size = "incoming_edges")
We can see that the Asian clubs are the most active members of the winter transfer window, being connected to almost all of the other leagues. The Chinese league, alongside MLS and the Spanish league, is the most active member of the transfer market's winter window in terms of buying players.
# Print density, reciprocity and assortativity coefficients for the winter league-level graph.
network_stats(winter_leagues_G)
The network is not very dense in comparison with transfers during all windows. The main attribute for the assortativity coefficient is continent, and the degree of the node has a negative effect on its assortativity.
# Print the degree, centrality and profit rankings for the winter league-level graph.
network_info(winter_leagues_G)
The central member in almost all measures is the Chinese Super League, as most of the clubs there got richer in the winter period and made a lot of expensive transfers in the winter transfer window.
# Statistics and rankings for the subgraph selected with country "England".
league_subgraph = get_subgraph(leagues_G, "country", "England")
network_stats(league_subgraph)
network_info(league_subgraph)
# The "#@title" / "#@param" markers below look like Colab form fields that let the
# reader pick the country and the plot title.
# NOTE(review): the default title mentions "Spanish" while the default country is "Italy".
#@title **Network by country**
country = "Italy" #@param ['Japan','Italy','Belgium','Spain','Mexico','Sweden','Saudi Arabia','Turkey','Brazil','Denmark','Netherlands','Portugal','France','Colombia','Germany','China','Norway','Argentina','Poland','England','Scotland','United States','Australia','Korea, South']
league_subgraph = get_subgraph(leagues_G, "country", country)
title = "Spanish league transfers network" #@param string
# Circular layout, with every node coloured by its own name.
call_plot(league_subgraph, title, nx.circular_layout, node_color = "name")
!pip install node2vec
from node2vec import Node2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
# Configure node2vec on the club graph: 16-dimensional vectors, 200 walks of length 6, p=1, q=2.
model = Node2Vec(transfers_G, dimensions=16, p=1, q=2, num_walks=200, walk_length=6)
# Fit the embedding model; plot_sim_nodes below assumes vectors of length 16.
embeddings = model.fit()
def plot_sim_nodes(embeddings, nodes):
    """Scatter-plot the nearest embedding neighbours of the given nodes.

    For every node in `nodes` the most similar nodes are looked up, their
    vectors are projected to two dimensions with PCA, and the points are
    shown in a Plotly scatter plot coloured by the query node and labelled
    with the neighbour's name.

    Parameters
    ----------
    embeddings : fitted embedding model or keyed vectors
        Either an object exposing ``.wv`` (a gensim ``Word2Vec`` model, as
        returned by ``Node2Vec.fit``) or the keyed vectors themselves. The
        vectors may have any dimensionality.
    nodes : iterable of node keys
        Query nodes; each must be present in the embedding vocabulary.

    Raises
    ------
    ValueError
        If no neighbour was found for any query node (e.g. `nodes` is empty).
    """
    # gensim >= 4 only offers lookup and most_similar on `.wv`; fall back to
    # the object itself when keyed vectors are passed in directly.
    vectors = getattr(embeddings, "wv", embeddings)

    # Accumulate in plain lists: repeated np.append copies the array on
    # every call and forced a hard-coded vector length.
    points = []
    records = []
    for node in nodes:
        for close_node, score in vectors.most_similar(node):
            points.append(vectors[close_node])
            records.append((close_node, node, score))
    if not points:
        raise ValueError("no similar nodes found for the given nodes")

    projected = PCA(n_components=2).fit_transform(np.vstack(points))
    df = pd.DataFrame({
        'x': projected[:, 0],
        'y': projected[:, 1],
        'sim_node': [record[0] for record in records],
        'node': [record[1] for record in records],
        'score': [record[2] for record in records],
    })
    fig = px.scatter(df, "x", "y", hover_data=['sim_node', "node", "score"],
                     color='node', text="sim_node", labels={"x": "", "y": ""},
                     title="Similar teams based on the network",)
    fig.update_traces(textposition='top center')
    fig.show()
# Plot the nearest embedding neighbours of FC Porto and Benfica.
plot_sim_nodes(embeddings, ['FC Porto','Benfica'])